#!/usr/bin/perl
#
# Please refer to the Plain Old Documentation (POD) at the end of this Perl Script for further information
#
# SUBVERSION PROPERTIES
#
# $LastChangedDate: 2010-08-27 08:11:03 +0000 (Fri, 27 Aug 2010) $
# $Rev: 24 $
# $Author: christian.heinrich@owasp.org $
# $URL: http://dic.googlecode.com/svn/tags/stable/dic.pl $
# $Id: dic.pl 24 2010-08-27 08:11:03Z christian.heinrich@owasp.org $

use strict;

# Instructions for CPAN Modules are available from http://code.google.com/p/dic/wiki/DownloadBuildInstall
# SOAP::Lite version 0.52 or newer is recommended by http://code.google.com/apis/soapsearch/api_faq.html#tech20
use SOAP::Lite;
use Getopt::Long;
use Data::Dumper;

# Pod::Usage (core) provides pod2usage() for the -usage and -man meta-options
use Pod::Usage;

# Script version, interpolated into the banner below.
# The value must be quoted: a bare 0.4.1 (a number with two dots) is parsed
# by Perl as a v-string, i.e. the three characters chr(0), chr(4) and chr(1),
# so the banner would contain control characters instead of "0.4.1".
# May be required to upload script to CPAN i.e. http://www.cpan.org/scripts/submitting.html
our $VERSION = '0.4.1';

# Banner: name and version, copyright, licence and Terms of Service warning.
# It is printed before the command line is processed.
print
    "\n\"Download Indexed Cache\" Proof of Concept (PoC) v$VERSION (Post Google SOAP Search API Deprecation Release)\n";
print "\n";
print "Copyright 2008, 2009, 2010 Christian Heinrich\n";
print "Licensed under the Apache License, Version 2.0\n\n";
print
    "WARNING: You are violating Google's Terms of Service if you execute this post http://googlecode.blogspot.com/2009/08/well-earned-retirement-for-soap-search.html\n\n";
# Values taken from the command line
my $google_api_key;

# TODO Validate the existence of the Google SOAP Search API WSDL file
my $google_api_wsdl_file;
my $query;
my $start;
my $gtos;

# Command line meta-options. These are flags: each is set to 1 by
# Getopt::Long when given and stays undefined otherwise.
my $usage;
my $man;
my $update;

# TODO Input Validation of command line arguments
GetOptions(
    "key=s"   => \$google_api_key,
    "wsdl=s"  => \$google_api_wsdl_file,
    "query=s" => \$query,
    "start=s" => \$start,
    "gtos"    => \$gtos,

    # Command line meta-options
    # version is excluded as it is printed prior to processing the command line arguments
    # verbose is excluded as output is less than 25 lines
    "usage"  => \$usage,
    "man"    => \$man,
    "update" => \$update,
    )
    or die(
    "Invalid command line arguments - execute with -usage for further information"
    );

# -usage and -man both display the complete POD. pod2usage() terminates the
# script itself; asking for help is not an error, hence exit status 0.
if ( $usage or $man ) {
    pod2usage( -verbose => 2, -exitval => 0 );
}

# -update only tells the user how to update; nothing is downloaded
if ($update) {
    print "Please execute \"svn update\" from the command line\n";
    exit 0;
}

# Process command line arguments

# -gtos is a flag, so it is only true when it was given on the command line
if ( !$gtos ) {
    die("-gtos not specified on the command line - You must acknowledge that you are violating Google's Terms of Service");
}

# The query names the output directory, so it must not be missing or empty
if ( !defined $query ) {
    die("-query not specified on the command line");
}
chomp($query);

# -start is the number of the first search result wanted, counted from 1.
# It is reduced by one before being passed to do_Google_Search(); without
# this check a missing or non-numeric value would silently become -1.
if ( !defined $start or $start !~ /\A[1-9][0-9]*\z/ ) {
    die("-start must be specified on the command line as a whole number of 1 or more");
}
$start = $start - 1;

# For demonstrations without exposing the Google SOAP Search API insert your Google SOAP Search API Key below to use dic.pl i.e. -key "demo"
# String comparison is required here: a numeric == treats every non-numeric key as 0 and therefore as equal to "demo"
if ( defined $google_api_key and $google_api_key eq "demo" ) {

    # Replace "insert_google_api_key" with your Google SOAP Search API Key
    # $google_api_key = "insert_google_api_key";
}

# strip ":" from Google Search Operator for Filename
# TODO Expand this to strip illegal filename chars e.g. \/:*?<>|
my $stripped_query = $query;
$stripped_query =~ s/://g;

# An empty name would turn the output directory into "/dic" i.e. a directory in the root of the filesystem
if ( $stripped_query eq "" ) {
    die("-query must contain at least one character other than \":\"");
}
my $dir = "$stripped_query/dic";

# The directory which holds the output of dic i.e. ./$query/dic
if ( !( -e $dir ) ) {
    print("Creating ./$dir\n\n");
    if ( !( -e "./$stripped_query" ) ) {
        mkdir("./$stripped_query")
            or die("Unable to create ./$stripped_query: $!");
    }
    mkdir("$dir") or die("Unable to create ./$dir: $!");
}
else { print "Appending ./$dir\n\n"; }

# Execute the search; the response is dumped to a file and then listed below
my $google_search_results
    = do_Google_Search( "$google_api_key", "$query", "$start" );

# TODO Display a warning if <estimatedTotalResultsCount> and <estimateIsExact> exceeds 1000

# Append a Data::Dumper dump of the complete response to datadumper.txt.
# The handle is closed straight away so that a failed write is reported.
open( my $data_dumper_fh, '>>', "./$dir/datadumper.txt" )
    or die("Unable to open ./$dir/datadumper.txt: $!");
print {$data_dumper_fh} ( Data::Dumper::Dumper($google_search_results) );
close($data_dumper_fh)
    or die("Unable to write ./$dir/datadumper.txt: $!");

# The URL corresponding to the Search Result .html file is listed in this .CSV file
open( my $url_fh, '>>', "./$dir/$stripped_query.csv" )
    or die("Unable to open ./$dir/$stripped_query.csv: $!");

my $google_search_result_number = $start;

# Only dereference the response when it really holds a list of results;
# under "use strict" anything else would abort with an unhelpful
# "Can't use ... as a ... ref" error.
my $result_elements;
if ( ref $google_search_results ) {
    $result_elements = $google_search_results->{resultElements};
}
if ( !ref $result_elements ) {
    warn "WARNING: doGoogleSearch did not return any search results\n";
    $result_elements = [];
}

# Loop through the results.
foreach my $google_search_result ( @{$result_elements} ) {

    # Set the results as variables
    ++$google_search_result_number;
    my $URL        = $google_search_result->{URL};
    my $cachedSize = $google_search_result->{cachedSize};
    print(    "Downloading "
            . $URL
            . " from Google Cache ["
            . $cachedSize . "] as "
            . $google_search_result_number
            . ".html\n" );

    # Remove "#" character from the following 2 through 6 lines and insert "#" character to the line below if your intercepting api.google.com supports the doGetCachedPage SOAP Message
    print
        "WARNING: Sensepost's Aura does not support doGetCachedPage SOAP Message\n";

    # my $google_cached_page = doGetCachedPage( "$google_api_key", "$URL" );
    # open( my $cached_page_fh, '>', "./$dir/$google_search_result_number.html" )
    #     or die("Unable to open ./$dir/$google_search_result_number.html: $!");
    # print {$cached_page_fh} $google_cached_page;
    # close($cached_page_fh);

# TODO Include the date and time the page was indexed i.e. to quote the cache page "It is a snapshot of the page as it appeared on [Date] [Time]"
    # One CSV row per result: result number, URL
    print {$url_fh} ( "$google_search_result_number" . "," . "$URL\n" );
}

close($url_fh) or die("Unable to write ./$dir/$stripped_query.csv: $!");

sub do_Google_Search {

    # Sends one doGoogleSearch SOAP Message and returns its response.
    #
    # Arguments (named as per the Google SOAP Search API Reference Documentation):
    #   key   - Google SOAP Search API Key
    #   q     - Google Search Query
    #   start - value of the -start cmd line argument, as adjusted by the caller
    #
    # The WSDL file is taken from the file-level $google_api_wsdl_file.
    #
    # TODO Check length of Google Search Query is 2048 bytes
    # TODO Check Google Search Query is a maximum of 10 Words
    # TODO Check only one site: term is in the Google Search Query
    my ( $key, $q, $start ) = @_;

    # The remaining doGoogleSearch parameters are fixed, in the order of the call
    my @request = (
        $key,
        $q,
        $start,

        # maxResults - TODO Must add a test to ensure that $maxResults is between 1 to 1000
        "10",

        # filter is boolean i.e. either "true" or "false"
        "false",

        # restricts - TODO Check Country of Restrict, TODO Check Topic of Restrict
        "",

        # safeSearch
        "false",

        # lr - TODO Check Language Restrict
        "",

        # ie is Input Encoding and this has been deprecated in the Google SOAP Search API
        "UTF-8",

        # oe is Output Encoding and this has been deprecated in the Google SOAP Search API
        "UTF-8",
    );

    # Create a new SOAP::Lite instance from the location of the GoogleSearch WSDL file
    my $wsdl_location = "file:$google_api_wsdl_file";
    my $soap_client   = SOAP::Lite->service($wsdl_location);

    # TODO Confirm that connection with api.google.com can be established
    my $response = $soap_client->doGoogleSearch(@request);

    # TODO Confirm that doGoogleSearchResponse SOAP Message is not empty due to exceeding 10K SOAP Messages with Google SOAP Search API Key
    return $response;
}

sub doGetCachedPage {

    # Sends one doGetCachedPage SOAP Message and returns its response.
    #
    # Arguments (named as per the Google SOAP Search API Reference Documentation):
    #   key - Google SOAP Search API Key
    #   URL - URL of the page to request
    #
    # The WSDL file is taken from the file-level $google_api_wsdl_file.
    # The response is returned exactly as SOAP::Lite hands it back.
    my ( $key, $URL ) = @_;

    # Location of the GoogleSearch WSDL file
    my $google_wsdl = "file:$google_api_wsdl_file";

    my $google_cache = SOAP::Lite->service($google_wsdl);

    # Use the key passed in by the caller; the file-level $google_api_key
    # was used here before, which silently ignored the first argument.
    my $doGetCachedPageResponse
        = $google_cache->doGetCachedPage( $key, $URL );

    # TODO Confirm that doGetCachedPageResponse SOAP Message is not empty due to exceeding 10K SOAP Messages with Google SOAP Search API Key
    return $doGetCachedPageResponse;
}

=head1 NAME

dic.pl - "Download Indexed Cache"

=head1 VERSION

This documentation refers to dic PoC v0.4.1 (Post Google SOAP Search API Deprecation Release).

=head1 USAGE

dic.pl -key [key] -wsdl [Google SOAP Search API WSDL file] -query [Google Search Query] -start [Starting Google Search Result Number] -gtos

=head1 REQUIRED ARGUMENTS

 -key           Google SOAP Search API Key or "demo"
 -wsdl          Google SOAP Search API WSDL file i.e. GoogleSearch.wsdl
 -query         Google Search Query
 -start         Starting Google Search Result Number
 -gtos          Acknowledgement of violation of Google's Terms of Service
 
=head1 OPTIONAL ARGUMENTS

 -man		Displays POD and exits.
 -usage		Displays POD and exits.
 -update	Displays the svn command to retrieve the latest update from code.google.com

=head1 DESCRIPTION

"Download Indexed Cache" implements the Google SOAP Search API to retrieve
content indexed within the Google Cache and supports the "Search Engine
Reconnaissance" section of the recently released OWASP Testing Guide v3.

WARNING: You are violating Google's Terms of Service if you execute this post
http://googlecode.blogspot.com/2009/08/well-earned-retirement-for-soap-search.html

=head1 DEPENDENCIES

=head1 PREREQUISITES

Instructions for CPAN Modules are available from
http://code.google.com/p/dic/wiki/DownloadBuildInstall

SOAP::Lite v0.52 CPAN Module
Data::Dumper CPAN Module

=head1 COREQUISITES

=head1 OSNAMES

cygwin

=head1 SCRIPT CATEGORIES

Web

=head1 INCOMPATIBILITIES

=head1 BUGS AND LIMITATIONS

Please refer to the comments beginning with "TODO" in the Perl Code.

Aura does not offer support for the doGetCachedPage SOAP Message.

=head1 AUTHOR

Christian Heinrich

=head1 CONTACT INFORMATION

christian.heinrich@owasp.org
christian.heinrich@cmlh.id.au
cmlh@cpan.org

http://www.linkedin.com/in/ChristianHeinrich

=head1 MAILING LIST

https://lists.owasp.org/mailman/listinfo/owasp-google-hacking
http://groups.google.com/group/download-indexed-cache

=head1 SUBVERSION REPOSITORY

http://code.google.com/p/dic

=head1 FURTHER INFORMATION AND UPDATES

http://www.owasp.org/index.php/Category:OWASP_Google_Hacking_Project
http://code.google.com/p/dic
http://downloadindexedcache.blogspot.com/
http://del.icio.us/cmlh/dic

=head1 LICENSE AND COPYRIGHT

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

Copyright 2008, 2009, 2010 Christian Heinrich
